Day 2 Session 1:
Dynamic spectral analysis
using Functional Principal Component Analysis (FPCA)

Introduction

Principal Component Analysis (PCA)

Preliminaries

Installing/loading packages

# installing packages
# install.packages("tidyverse")
# install.packages("mgcv")
# install.packages("itsadug")
# install.packages("tidymv")
# install.packages("tidygam")

# importing packages
library(tidyverse)
library(fdapace)

Importing data set

Let’s import the data set. We are using the data set openly available on the Open Science Framework (OSF) repository.

# import the csv file "initial.liquid.dynamic.csv" from the "data" directory and save it as "df_dyn"
# (the path is relative to the current working directory / project root)
df_dyn <- readr::read_csv("data/initial.liquid.dynamic.csv")

Data wrangling

Check data

We always start with inspecting the data set using colnames().

# Let's check what columns the data frame contains
colnames(df_dyn)
##  [1] "...1"           "file"           "speaker"        "language"      
##  [5] "f0"             "duration"       "segment"        "previous_sound"
##  [9] "next_sound"     "word"           "f1"             "f2"            
## [13] "f3"             "time"           "IsApproximant"  "IsAcoustic"    
## [17] "note"           "gender"         "omit"           "Barkf1"        
## [21] "Barkf2"         "Barkf3"         "f2f1"           "f3f2"          
## [25] "Barkf2f1"       "Barkf3f2"       "position"       "context"       
## [29] "liquid"         "input_file"

Omitting irrelevant columns

We’ll omit the columns we don’t need.

# Check the distinct values of IsApproximant
# (summarise() with no summary expressions returns one row per group,
#  i.e. the distinct values -- not token counts)
df_dyn |> 
  dplyr::group_by(IsApproximant) |> 
  dplyr::summarise() |> 
  dplyr::ungroup()
## # A tibble: 1 × 1
##   IsApproximant
##   <chr>        
## 1 yes
# Check the distinct values of the recording-quality flag IsAcoustic
# (one row per distinct value, not a count of tokens)
df_dyn |> 
  dplyr::group_by(IsAcoustic) |> 
  dplyr::summarise() |> 
  dplyr::ungroup()
## # A tibble: 1 × 1
##   IsAcoustic
##   <chr>     
## 1 yes
# How about 'omit'? (again, this lists distinct values rather than counts)
df_dyn |> 
  dplyr::group_by(omit) |> 
  dplyr::summarise() |> 
  dplyr::ungroup()
## # A tibble: 1 × 1
##    omit
##   <dbl>
## 1     0
# Remove columns that we no longer need:
# the three flags are constant across all rows (checked above),
# and the Bark-scaled / formant-ratio measures are not used in this session
df_dyn <- df_dyn |> 
  dplyr::select(-c(IsApproximant, IsAcoustic, omit, Barkf1, Barkf2, Barkf3, Barkf2f1, Barkf3f2, f2f1, f3f2))

Let’s check the column names again.

colnames(df_dyn)
##  [1] "...1"           "file"           "speaker"        "language"      
##  [5] "f0"             "duration"       "segment"        "previous_sound"
##  [9] "next_sound"     "word"           "f1"             "f2"            
## [13] "f3"             "time"           "note"           "gender"        
## [17] "position"       "context"        "liquid"         "input_file"

Let’s also convert the context column into IPA symbols for a more intuitive representation:

# convert the ARPABET notation into IPA symbols
df_dyn <- df_dyn |> 
  dplyr::mutate(
    context = case_when(
      context == "AE" ~ "/æ/",
      context == "IY" ~ "/i/",
      context == "UW" ~ "/u/",
      # keep any unexpected label unchanged instead of silently producing NA
      TRUE ~ context
    )
  )

Checking the number of participants, tokens…

Let’s also obtain some descriptive statistics here. Note that we need to divide the number of rows by 11 to obtain the accurate number of tokens, as one token now has 11 time points.

# number of participants
df_dyn |> 
  dplyr::group_by(language) |> 
  dplyr::summarise(n = n_distinct(speaker)) |> 
  dplyr::ungroup()
## # A tibble: 2 × 2
##   language     n
##   <chr>    <int>
## 1 English     14
## 2 Japanese    31
# number of tokens per segment
df_dyn |> 
  dplyr::group_by(segment) |> 
  dplyr::summarise(n = n()/11) |> # divide by 11 time points
  dplyr::ungroup()
## # A tibble: 6 × 2
##   segment     n
##   <chr>   <dbl>
## 1 LAE1      511
## 2 LIY1      408
## 3 LUW1      299
## 4 RAE1      502
## 5 RIY1      481
## 6 RUW1      314

Data visualisation

Scaling formant frequencies

Do you remember how to visualise the dynamic data? The basic procedure is the same as in the static analysis: we first apply z-score normalisation to the formant frequencies to make sure that formant values are comparable across speakers.

# Normalise each speaker's formant frequencies separately so that
# values are comparable across speakers.
df_dyn <- df_dyn |> 
  dplyr::group_by(speaker) |> 
  dplyr::mutate(
    # z-score f1/f2/f3; as.numeric() strips the matrix attributes scale() adds;
    # .names appends "z" to each column name (f1z, f2z, f3z)
    dplyr::across(c(f1, f2, f3), \(v) as.numeric(scale(v)), .names = "{.col}z")
  ) |> 
  dplyr::ungroup() # leave the data frame ungrouped for later steps

Descriptive statistics

Let’s check the mean and SD for both raw and normalised formant values; we’ll look at F1 only for now. Note that the mean z-scores are not printed as exactly zero, but this is only because floating-point arithmetic introduces tiny rounding errors: values on the order of 1e-16 are effectively zero.

# check mean and sd of raw/scaled F1 values for each speaker
df_dyn |> 
  dplyr::group_by(speaker) |>
  dplyr::summarise(
    f1_mean = mean(f1),
    f1_sd = sd(f1),
    f1z_mean = mean(f1z),
    f1z_sd = sd(f1z)
  ) |> 
  dplyr::ungroup() 
## # A tibble: 45 × 5
##    speaker f1_mean f1_sd  f1z_mean f1z_sd
##    <chr>     <dbl> <dbl>     <dbl>  <dbl>
##  1 2d57ke     468.  140. -9.05e-17   1   
##  2 2drb3c     575.  243.  2.39e-16   1   
##  3 2zy9tf     459.  228. -2.41e-16   1   
##  4 3bcpyh     438.  142. -5.72e-18   1.00
##  5 3hsubn     537.  177.  6.11e-17   1   
##  6 3pzrts     458.  195. -1.44e-18   1.00
##  7 4ps8zx     467.  172.  2.19e-16   1   
##  8 54i2ks     453.  192.  1.43e-16   1.00
##  9 5jzj2h     412.  133.  2.49e-16   1.00
## 10 5upwe3     444.  189. -6.51e-17   1.00
## # ℹ 35 more rows

Visualisation

raw trajectories

Let’s visualise the raw data first:

# F2 - raw trajectories
# NOTE(review): cbPalette must be defined earlier in the session
# (e.g., a colour-blind-friendly palette) -- it is not created in this file.
df_dyn |> 
  ggplot(aes(x = time, y = f2z)) +
  # `width` is not a parameter of geom_point()/geom_path() (it was silently
  # ignored); use `size`/`linewidth` to control marker and line thickness
  geom_point(aes(colour = language, group = file), size = 0.3, alpha = 0.4) +
  geom_path(aes(colour = language, group = file), linewidth = 0.3, alpha = 0.4) +
  geom_hline(yintercept = 0, linetype = "dashed", linewidth = 0.5) +
  scale_colour_manual(values = cbPalette) + 
  facet_grid(liquid ~ context) +
  labs(x = "time", y = "F2 (z-normalised)", title = "time-varying change in F2 frequency") +
  theme(strip.text.y = element_text(angle = 0))

smooths

Let’s also try plotting smoothed trajectories:

# F2 - smooths
# NOTE(review): cbPalette must be defined earlier in the session
# (e.g., a colour-blind-friendly palette) -- it is not created in this file.
df_dyn |> 
  ggplot(aes(x = time, y = f2z)) +
  # uncomment to overlay the raw observations under the smooths
  # (use size=/linewidth=, not width=, which these geoms do not accept)
  # geom_point(aes(colour = language, group = file), size = 0.3, alpha = 0.1) +
  # geom_path(aes(colour = language, group = file), linewidth = 0.3, alpha = 0.1) +
  geom_smooth(aes(colour = language, group = language), linewidth = 1.2, se = TRUE) +
  geom_hline(yintercept = 0, linetype = "dashed", linewidth = 0.5) +
  scale_colour_manual(values = cbPalette) + 
  facet_grid(liquid ~ context) +
  labs(x = "time", y = "F2 (z-normalised)", title = "smoothed time-varying change in F2 frequency") +
  theme(strip.text.y = element_text(angle = 0))
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Functional Principal Component Analysis (FPCA)

At this point, we tried fitting Generalised Additive Mixed-Effect Models (GAMMs) to investigate between-group differences in F2 dynamics. This was possible because we had some ideas of possible factors (e.g., vowel context, speaker groups). In a way, lots of decisions were made in a top-down manner.

We could also try another approach: a bottom-up approach. That is, do the data tell us anything about higher-order groupings? Do the vowel context/speaker group differences emerge from the data?

Here, let’s try Functional Principal Component Analysis (FPCA) as a bottom-up approach.

# IDs = token column; tVec = time column; yVec = variable column(s)
input_df <- fdapace::MakeFPCAInputs(IDs = df_dyn$file, tVec = df_dyn$time, yVec = df_dyn$f2z)

# Check if there's any issues with the data
fdapace::CheckData(input_df$Ly, input_df$Lt)

# No errors have been returned, so let's now run FPCA on the dynamic F2 trajectories
# (Lt is now passed by name, consistent with the rest of the script)
df_dyn_fpca <- fdapace::FPCA(Ly = input_df$Ly, Lt = input_df$Lt, optns = list(plot = TRUE))

checking fpca results

# eigenvalues
df_dyn_fpca$lambda
## [1] 73.431312 13.589246  4.373711  1.285172
# the cumulative percentage of variance explained by the eigenvalue
df_dyn_fpca$cumFVE
## [1] 0.7859692 0.9314212 0.9782350 0.9919908
# list of sampling time
df_dyn_fpca$workGrid
##  [1]   0  10  20  30  40  50  60  70  80  90 100
# PC scores -> each row is 1 token, each column is one PC
head(df_dyn_fpca$xiEst)
##            [,1]     [,2]       [,3]        [,4]
## [1,]  1.3978292 5.260927  2.1324070 -2.52256410
## [2,] -1.5626684 5.595501  0.2863422 -1.84886236
## [3,] -0.6166366 4.291694  1.1572724 -1.35058648
## [4,] -5.5315817 2.679823  0.1415474  0.08740263
## [5,]  0.5428989 3.542994  3.8331163 -2.47430535
## [6,] -2.7433146 7.071773 -2.1509403 -1.88748022
# Mean curve
fdapace::GetMeanCurve(Ly = input_df$Ly, Lt = input_df$Lt, optns = list(plot = TRUE))

## $mu
##  [1] -0.70831425 -0.56725924 -0.32739985 -0.04873545  0.18428169  0.31887341
##  [7]  0.37257545  0.37071962  0.31025622  0.17889910 -0.08389668
## 
## $workGrid
##  [1]   0  10  20  30  40  50  60  70  80  90 100
## 
## $bwMu
## NULL
## 
## $optns
## $optns$userBwMu
## [1] 5
## 
## $optns$methodBwMu
## [1] "Default"
## 
## $optns$userBwCov
## [1] 10
## 
## $optns$methodBwCov
## [1] "Default"
## 
## $optns$kFoldMuCov
## [1] 10
## 
## $optns$methodSelectK
## [1] "FVE"
## 
## $optns$FVEthreshold
## [1] 0.99
## 
## $optns$FVEfittedCov
## NULL
## 
## $optns$fitEigenValues
## [1] FALSE
## 
## $optns$maxK
## [1] 9
## 
## $optns$dataType
## [1] "Dense"
## 
## $optns$error
## [1] TRUE
## 
## $optns$shrink
## [1] FALSE
## 
## $optns$nRegGrid
## [1] 11
## 
## $optns$rotationCut
## [1] 0.25 0.75
## 
## $optns$methodXi
## [1] "IN"
## 
## $optns$kernel
## [1] "epan"
## 
## $optns$lean
## [1] FALSE
## 
## $optns$diagnosticsPlot
## NULL
## 
## $optns$plot
## [1] TRUE
## 
## $optns$numBins
## NULL
## 
## $optns$useBinnedCov
## [1] TRUE
## 
## $optns$usergrid
## [1] FALSE
## 
## $optns$yname
## [1] "Ly"
## 
## $optns$methodRho
## [1] "vanilla"
## 
## $optns$verbose
## [1] FALSE
## 
## $optns$userMu
## NULL
## 
## $optns$userCov
## NULL
## 
## $optns$methodMuCovEst
## [1] "cross-sectional"
## 
## $optns$userRho
## NULL
## 
## $optns$userSigma2
## NULL
## 
## $optns$outPercent
## [1] 0 1
## 
## $optns$useBinnedData
## [1] "AUTO"
## 
## $optns$useBW1SE
## [1] FALSE
## 
## $optns$imputeScores
## [1] TRUE
# default diagnostic plots for the fitted FPCA object
plot(df_dyn_fpca)

# scree plot (namespace-qualified for consistency with the rest of the script)
fdapace::CreateScreePlot(df_dyn_fpca)

# path plot of the fitted trajectories
fdapace::CreatePathPlot(df_dyn_fpca, xlab = "normalised time", ylab = "F2 (z-normalised)")

# Extract PC scores from an FPCA object and return a data frame with one row
# per token: a "file" column holding the token ID, then one column per PC
# (PC1, PC2, ...).
#   fpcaObj: a fitted fdapace::FPCA object (uses $xiEst, $inputData$Lt, $lambda)
get_pc_scores <- function(fpcaObj){
  pcs <- data.frame(fpcaObj$xiEst)       # PC scores: rows = tokens, cols = PCs
  token <- names(fpcaObj$inputData$Lt)   # token IDs are the list names of Lt
  df <- cbind(token, pcs)
  n_pcs <- length(fpcaObj$lambda)        # number of retained PCs
  pc_names <- paste0("PC", seq_len(n_pcs)) # seq_len() is safe even if n_pcs == 0
  names(df) <- c("file", pc_names)
  df
}

# get PC scores w/ token info
pc_df <- get_pc_scores(df_dyn_fpca)

# join PCs (pc_df) with selected metadata columns from the original data frame
## store meta info (one row per measurement, so tokens are duplicated here)
meta <- df_dyn |>  
  dplyr::select(speaker, gender, language, word, liquid, context, file)

## collapse to one row per token before joining -- otherwise each token's PC
## scores would be duplicated once per time point. dplyr::-qualified calls
## avoid accidental masking by other attached packages.
dat <- dplyr::left_join(pc_df, dplyr::distinct(meta), by = "file")